# Discretion in Clinical Decision-Making
# Claire Boone, March 2025
# Clean raw EHR data
# No tables or figures created

# set up -----------------
  library(here)
  library(tidyverse)
  library(dplyr)
  library(panelr)
  library(reshape2)
  library(lubridate)

  # Project root as resolved by here::here(); every data folder hangs off it.
  proj_dir <- here::here()

  # Build a directory path that always ends in "/". The rest of the script
  # appends file names with paste0(), so the trailing separator is required.
  # file.path(proj_dir, "raw_data/") is not enough on its own: on Windows,
  # file.path() removes a trailing separator from its result, which would glue
  # the folder name and the file name together.
  project_subdir <- function(folder) {
    paste0(file.path(proj_dir, folder), "/")
  }
  raw_data <- project_subdir("raw_data")
  gen_data <- project_subdir("gen_data")
  out <- project_subdir("out")

# load data ---------------------------
  # The .Rda file is expected to define `raw_smsdata`; `hd` is the working copy
  # that every later step modifies.
  load(paste0(raw_data, "raw_smsdata.Rda"))
  hd <- raw_smsdata

# initial cleaning --------------------------- 
  # Lower-case every character column so that the string comparisons further
  # down can use lower-case literals. Non-character columns pass through as is.
  hd <- data.frame(lapply(hd, function(column) {
    if (!is.character(column)) {
      return(column)
    }
    tolower(column)
  }))

  # If cod_est arrives as a factor, convert through its level labels so the
  # numeric value is the label itself rather than the internal level index.
  if (is.factor(hd$cod_est)) {
    hd$cod_est <- as.numeric(levels(hd$cod_est))[hd$cod_est]
  }

  # male: 1 where sexo is 2, 0 where sexo is 4, NA for anything else
  # (including missing sexo).
  hd$male <- NA
  hd$male[which(hd$sexo == 2)] <- 1
  hd$male[which(hd$sexo == 4)] <- 0

  # Insurance as a factor; the explicit "not reported" category is treated as
  # missing.
  hd$insurance <- as.factor(hd$seguro)
  hd$insurance[hd$insurance == "no informado (ignorado)"] <- NA

  fonasa_levels <- c("fonasa a", "fonasa b", "fonasa c", "fonasa d")

  # isapre = 1 for any non-missing insurance value outside the four Fonasa
  # tiers; 0 for the Fonasa tiers and for missing insurance.
  hd$isapre <- ifelse(!hd$insurance %in% fonasa_levels & !is.na(hd$insurance), 1, 0)

  # Fonasa tier as a labelled factor (NA for non-Fonasa rows). The levels are
  # given explicitly: without them factor() infers the levels from the values
  # present, and the four labels no longer match (an error) whenever one of the
  # tiers does not occur in the data.
  hd$fonasa <- factor(match(hd$insurance, fonasa_levels),
                      levels = seq_along(fonasa_levels),
                      labels = c("Fonasa A", "Fonasa B", "Fonasa C", "Fonasa D"))

  # Visit date and next-control date are parsed as day-month-year.
  hd$fecha_atencion <- lubridate::dmy(hd$fecha_atencion)
  hd$fecha_prox_control <- lubridate::dmy(hd$fecha_prox_control)

  # Split blood pressure strings of the form "<sys>/<dia>" into two columns.
  # Only attempted when at least one value contains a "/"; otherwise pa_sys and
  # pa_dia are left exactly as they are in the incoming data.
  has_slash <- grepl("/", hd$pa_mmhg, fixed = TRUE)
  if (any(has_slash)) {
    # Only the pa_mmhg column is needed for the split; convert = TRUE lets
    # tidyr turn the two pieces into numbers where possible.
    bp_parts <- tidyr::separate(hd["pa_mmhg"], col = pa_mmhg,
                                into = c("pa_sys", "pa_dia"),
                                sep = "/", convert = TRUE)
    hd$pa_sys <- bp_parts$pa_sys
    hd$pa_dia <- bp_parts$pa_dia
  }

  # Numeric version of riesgo_cv.
  # NOTE(review): if riesgo_cv is a factor, as.numeric() returns the level
  # index rather than the label; confirm that is what is wanted. Also note the
  # final column selection keeps riesgo_cvn, not riesgo_cvnew.
  hd$riesgo_cvnew <- as.numeric(hd$riesgo_cv)

  # alc_prob = 1 for the two problem-drinking categories, 0 for everything
  # else (missing alcohol values included).
  problem_drinking <- c("posible consumo problema o dependencia", "dependencia al alcohol")
  hd$alc_prob <- as.numeric(hd$alcohol %in% problem_drinking)

  # Clamp lab values to [lower, upper] while keeping missing values missing.
  # The earlier form, pmax(pmin(x, upper), lower, na.rm = TRUE), replaced every
  # NA with `lower` (pmax drops the NA and returns the remaining argument), so
  # a patient with no measurement was recorded at the floor value.
  clamp_keep_na <- function(x, lower, upper) {
    pmax(pmin(x, upper), lower)
  }

  hd$colesterol_total_mgdl <- clamp_keep_na(hd$colesterol_total_mgdl, 50, 500)
  hd$hdl_mgdl <- clamp_keep_na(hd$hdl_mgdl, 1, 200)
  hd$trigliceridos_mgdl <- clamp_keep_na(hd$trigliceridos_mgdl, 10, 5000)

  # Recode tabaco in place: 1 for codes 2 and 5, 0 for every other value
  # (missing included). Running this line twice would zero the column, since
  # the recoded values 0/1 are not among the original codes.
  hd$tabaco <- as.numeric(hd$tabaco %in% c(2, 5))

  # edad_actual is split on spaces into six tokens; the 1st, 3rd and 5th are
  # used as the year, month and day counts of the patient's age.
  age_parts <- reshape2::colsplit(hd$edad_actual, pattern = " ",
                                  names = c("year", "anos", "mon", "meses", "day", "dias"))

  # Date of birth = reference date minus the recorded age.
  # NOTE(review): 01-04-2019 is presumably the date edad_actual was computed
  # (data extraction date) -- confirm.
  reference_date <- dmy("01-04-2019")
  hd$dob <- reference_date -
    years(as.numeric(age_parts$year)) -
    months(as.numeric(age_parts$mon)) -
    days(as.numeric(age_parts$day))

  # Age at the visit in years. Uses days / 365.25; the earlier weeks / 52.25
  # corresponds to a 365.75-day year and understated every age by about 0.14%.
  hd$age_at_visit <- as.numeric(difftime(hd$fecha_atencion, hd$dob, units = "days")) / 365.25

# save cleaned EHR dataset --------------------------
  # Keep only the analysis columns. The order below is the column order of the
  # saved data set; selecting a column that does not exist is an error.
  hd <- dplyr::select(
    hd,
    id, cod_est, cod_ss, dependencia, tipo_est, edad_actual, male,
    diag_cv, fecha_atencion, fecha_prox_control,
    fonasa, fonasa_a, fonasa_b, fonasa_c, fonasa_d, isapre,
    peso, talla, imc, fondo_ojo, vig_fondo_ojo,
    hba1c, glicemia_mgdl, colesterol_total_mgdl, trigliceridos_mgdl,
    circ_cintura, estado_hta, estado_dm2, estado_dislip,
    hemoglobina, hdl_mgdl, nefropatia, edema, usa_insulina,
    ave, fecha_ave, iam, fecha_iam,
    retinopatia, fecha_retinopatia, neuropatia, fecha_neuropatia,
    hipertrofia_v_izq, fecha_hipertrofia_v_izq,
    adherencia_dieta, adherencia_meds,
    erc, ant_fliar_cardiopatia, ant_lipidos_genetico,
    en_tto_med, fecha_inicio_tto,
    amputacion_dm2, ldl_mgdl,
    pro, prof_doc, prof_nur, prof_nut, prof_oth, prof_tec, prof_kin,
    profesionaln,
    alc_no, alc_risk, alc_harm, alc_prob, alc_dep,
    pa_sys, pa_dia,
    nut_und, nut_mal, nut_mal2, nut_mega, nut_norm, nut_obese,
    nut_ob1, nut_ob2, nut_ob3, nut_obsev, nut_ris, nut_over,
    risk_low, risk_med, risk_hi, risk_vhi, riesgo_cvn,
    cor_risk_low, cor_risk_med, cor_risk_hi, cor_risk_vhi,
    diag_htn, diag_dm, diag_dislip, act_fisica, sedentarismo,
    tabaco, rac, ev_pie_diabetico, dob, age_at_visit, ant_pers_enf_cv
  )

  # Write the cleaned data set (object name `hd`) to the generated-data folder.
  cleaned_path <- paste0(gen_data, "ehr_cleaned.Rda")
  save(hd, file = cleaned_path)
  

  